package com.hp.hpl.sparta;

import java.io.*;


/**
 * An XML byte stream that has been parsed into a DOM tree.
 * Just like ParseCharStream except handle Unicode encoding of byte stream.
 * Use rules in
 * http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing to guess
 * encoding -- if encoding declaration is different, restart parsing.
 *
 * <blockquote><small> Copyright (C) 2002 Hewlett-Packard Company.
 * This file is part of Sparta, an XML Parser, DOM, and XPath library.
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.  This library
 * is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.</small></blockquote>
 *
 * @author Eamonn O'Brien-Strain
 * @version $Date: 2003/07/28 04:33:04 $  $Revision: 1.5 $
 * @see <a "href="doc-files/LGPL.txt">GNU Lesser General Public License</a>
 */

class ParseByteStream implements ParseSource {

    /**
     * Parse XML document from byte stream, converting to Unicode
     * characters as specifed by the initial byte-order-mark.
     *
     * @param istream is the source of bytes and must support mark so that
     *                we can peek ahead at its first two bytes
     */
    public ParseByteStream(String systemId, InputStream istream, ParseLog log,
                           String guessedEncoding, ParseHandler handler) throws ParseException, IOException {
        if (log == null) log = DEFAULT_LOG;

        //We need to be able to restart the stream if the declared encoding
        //is different than our guess, os buffer if necessary.  We also need
        //to be able to peek ahead at the first 4 bytes
        if (!istream.markSupported())
            throw new Error(
                    "Precondition violation: the InputStream passed to ParseByteStream must support mark");
        istream.mark(MAXLOOKAHEAD); //mark at begining

        byte[] start = new byte[4];
        int n = istream.read(start);

        if (guessedEncoding == null) guessedEncoding = guessEncoding(systemId, start, n, log);

        try {

            //First try with guessed encoding
            istream.reset();
            InputStreamReader reader = new InputStreamReader(istream, fixEncoding(guessedEncoding));
            try {

                parseSource_ = new ParseCharStream(systemId, reader, log, guessedEncoding, handler);
                //}catch( CharConversionException e ){
            } catch (IOException e) {

                //This exception seems to be caused by reading euc-jp as utf-8
                String secondGuessEncoding = "euc-jp";
                log.note("Problem reading with assumed encoding of " + guessedEncoding
                        + " so restarting with " + secondGuessEncoding, systemId, 1);
                istream.reset();
                try {
                    reader = new InputStreamReader(istream, fixEncoding(secondGuessEncoding));
                } catch (UnsupportedEncodingException ee) {
                    throw new ParseException(log, systemId, 1, '\0', secondGuessEncoding, "\""
                            + secondGuessEncoding + "\" is not a supported encoding");
                }

                parseSource_ = new ParseCharStream(systemId, reader, log, null, handler);
            }
        } catch (EncodingMismatchException e) {
            //if that didn't work try declared encoding
            String declaredEncoding = e.getDeclaredEncoding();
            log.note("Encoding declaration of " + declaredEncoding + " is different that assumed "
                    + guessedEncoding + " so restarting the parsing with the new encoding", systemId, 1);
            istream.reset();
            InputStreamReader reader;
            try {
                reader = new InputStreamReader(istream, fixEncoding(declaredEncoding));
            } catch (UnsupportedEncodingException ee) {
                throw new ParseException(log, systemId, 1, '\0', declaredEncoding, "\"" + declaredEncoding
                        + "\" is not a supported encoding");
            }
            parseSource_ = new ParseCharStream(systemId, reader, log, null, handler);
        }
    }

    public String toString() {
        return parseSource_.toString();
    }

    public String getSystemId() {
        return parseSource_.getSystemId();
    }

    /**
     * Last line number read by parser.
     */
    public int getLineNumber() {
        return parseSource_.getLineNumber();
    }

    /**
     * @link aggregationByValue
     */
    private ParseCharStream parseSource_;

    /////////////////////////////////////////////////////////////////////

    /**
     * Convert byte stream to Unicode character stream according to
     * http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
     * .
     */
    static private String guessEncoding(String systemId, byte[] start, int n, ParseLog log)
            throws IOException {
        //Test for UTF-16 byte-order mark
        String encoding;
        if (n != 4) {
            String msg =
                    n <= 0 ? "no characters in input" : "less than 4 characters in input: \""
                            + new String(start, 0, n) + "\"";
            log.error(msg, systemId, 1);
            encoding = "UTF-8";
        } else if (equals(start, 0x0000FEFF) || equals(start, 0xFFFE0000) || equals(start, 0x0000FFFE)
                || equals(start, 0xFEFF0000) || equals(start, 0x0000003C) || equals(start, 0x3C000000)
                || equals(start, 0x00003C00) || equals(start, 0x003C0000))
            encoding = "UCS-4";
        else if (equals(start, 0x003C003F))
            encoding = "UTF-16BE"; //or ISO-10646-UCS-2
        else if (equals(start, 0x3C003F00))
            encoding = "UTF-16LE"; //or ISO-10646-UCS-2
        else if (equals(start, 0x3C3F786D))
            encoding = "UTF-8";//or ISO 646, ASCII, ISO 8859, Shift-JIS, EUC
        else if (equals(start, 0x4C6FA794))
            encoding = "EBCDIC";
        else if (equals(start, (short) 0xFFFE) || equals(start, (short) 0xFEFF))
            encoding = "UTF-16";
        else
            encoding = "UTF-8";

        if (!encoding.equals("UTF-8"))
            log.note("From start " + hex(start[0]) + " " + hex(start[1]) + " " + hex(start[2]) + " "
                    + hex(start[3]) + " deduced encoding = " + encoding, systemId, 1);
        return encoding;
    }

    static private String hex(byte b) {
        String s = Integer.toHexString(b);
        switch (s.length()) {
            case 1:
                return "0" + s;
            case 2:
                return s;
            default:
                return s.substring(s.length() - 2);
        }
    }

    static private boolean equals(byte[] bytes, int integer) {
        return bytes[0] == (byte) ((integer >>> 24)) && bytes[1] == (byte) ((integer >>> 16) & 0xFF)
                && bytes[2] == (byte) ((integer >>> 8) & 0xFF) && bytes[3] == (byte) ((integer) & 0xFF);
    }

    static private boolean equals(byte[] bytes, short integer) {
        return bytes[0] == (byte) ((integer >>> 8)) && bytes[1] == (byte) ((integer) & 0xFF);
    }

    static private String fixEncoding(String encoding) {
        return encoding.toLowerCase().equals("utf8") ? "UTF-8" : encoding;
    }

}


// $Log: ParseByteStream.java,v $
// Revision 1.5  2003/07/28 04:33:04  eobrain
// Fix bug that was removing dashes from unicode encoding names.  We
// should do this only for UTF-8.
//
// Revision 1.4  2003/07/17 23:55:28  eobrain
// Make compatiblie with J2ME.  For example do not use "new"
// java.util classes.
//
// Revision 1.3  2003/01/09 01:05:38  yuhongx
// added FixEncoding().
//
// Revision 1.2  2002/11/06 02:57:59  eobrain
// Organize imputs to removed unused imports.  Remove some unused local variables.
//
// Revision 1.1.1.1  2002/08/19 05:04:00  eobrain
// import from HP Labs internal CVS
//
// Revision 1.14  2002/08/18 04:36:25  eob
// Make interface package-private so as not to clutter up the javadoc.
//
// Revision 1.13  2002/08/17 00:54:14  sermarti
//
// Revision 1.12  2002/08/05 20:04:32  sermarti
//
// Revision 1.11  2002/07/25 21:10:15  sermarti
// Adding files that mysteriously weren't added from Sparta before.
//
// Revision 1.10  2002/05/23 22:00:19  eob
// Add better error handling.
//
// Revision 1.9  2002/05/09 17:02:26  eob
// Fix NullPointerException in error reporting.
//
// Revision 1.8  2002/05/09 16:49:52  eob
// Add history for better error reporting.
//
// Revision 1.7  2002/03/21 23:50:49  eob
// Deprecate functionality moved to Parser facade class.
//
// Revision 1.6  2002/02/15 21:30:38  eob
// Comment changes only.
//
// Revision 1.5  2002/02/01 21:55:15  eob
// Comment change only.
//
// Revision 1.4  2002/01/09 00:45:58  eob
// Formatting change only.
//
// Revision 1.3  2002/01/09 00:44:57  eob
// Handle CharConversionException caused by reading euc-jp characters
// before encoding has been established.  Restart parsing.
//
// Revision 1.2  2002/01/08 19:53:43  eob
// Comment change only.
//
// Revision 1.1  2002/01/08 19:31:33  eob
// Factored out ParseSource functionality into ParseCharStream and
// ParseByteStream.
